In [8]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
%matplotlib inline
pd.__version__
Out[17]:
In [10]:
values = [5,3,4,8,2,9]
vals = pd.Series(values)
vals
Out[10]:
In [11]:
vals.index
Out[11]:
In [12]:
vals.values
Out[12]:
In [13]:
vals * 2.5
Out[13]:
In [14]:
vals2 = pd.Series(values, index=['tom','sally','jeff','george','pablo','florence'])
vals2
Out[14]:
In [18]:
vals2[['florence','tom']]
Out[18]:
In [19]:
vals2[['florence','tom','kate']]
Out[19]:
In [20]:
vals3 = vals2[['tom','sally','pablo','florence','ricky','katrin']]
vals3
Out[20]:
In [22]:
vals3.dropna()
Out[22]:
In [23]:
vals3.fillna(0)
Out[23]:
In [24]:
vals3.fillna(vals3.mean())
Out[24]:
In [25]:
vals3.fillna(method='ffill')
Out[25]:
In [26]:
vals3.describe()
Out[26]:
In [28]:
vals.index = pd.Index(['tom','sally','pablo','florence','ricky','katrin'])
vals3 = vals3[['tom','sally','pablo','florence','billy','katrin']]
In [29]:
# create a dataframe
dat = pd.DataFrame({'orig':vals,'new':vals3})
dat
Out[29]:
In [30]:
dat.isnull()
Out[30]:
In [31]:
dat.dropna()
Out[31]:
In [33]:
hipster = pd.read_csv('hipster.csv')
hipster[:10]
Out[33]:
Set the index to a datetime
In [34]:
hipster = hipster.set_index(pd.DatetimeIndex(hipster.pop('Date')))
hipster[:10]
Out[34]:
Now load the anti-Hipster data
In [36]:
not_hipster = pd.read_csv('negative_hipster.csv')
not_hipster = not_hipster.set_index(pd.DatetimeIndex(not_hipster.pop('Date')))
In [37]:
not_hipster[:10]
Out[37]:
Check the values of one column
In [38]:
hipster.hipster.head()
Out[38]:
Check another, but get the values as a numpy.ndarray
In [39]:
hipster['gumtree perth'].values[:20]
Out[39]:
View the data types; they don't need to be homogeneous
In [40]:
hipster.dtypes
Out[40]:
Joins on indexes are easy!
In [41]:
trend = hipster.join(not_hipster, how='inner')
trend.head()
Out[41]:
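The join above is on the index; an outer join keeps the union of the two date ranges instead, padding any non-overlapping dates with NaN. A minimal sketch using the same frames:
In [ ]:
# sketch: an outer join keeps every date from both frames,
# filling gaps with NaN instead of dropping them
trend_outer = hipster.join(not_hipster, how='outer')
trend_outer.shape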
We can check the column names and values
In [42]:
trend.columns
Out[42]:
In [43]:
trend.values
Out[43]:
Filtering on date ranges is simple
In [44]:
trend['2012-01-01':].head()
Out[44]:
In [45]:
trend['2012-01-01': '2013-01-01'].tail(3)
Out[45]:
We can also grab a single date, or a subset of columns
In [46]:
trend.ix['2012-01-01', ['hipster', 'modcloth']]
Out[46]:
Or do some boolean filtering
In [48]:
trend[trend.melway < 0].head()
Out[48]:
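Conditions can be combined too, but note that pandas needs the bitwise operators & and | (with parentheses around each condition) rather than Python's and/or. A quick sketch on the columns above:
In [ ]:
# sketch: combine two boolean conditions with & (not `and`)
trend[(trend.melway < 0) & (trend.hipster > 0)].head()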
Plotting is built in, and handles dates more easily than raw matplotlib
In [49]:
_ = trend.plot(figsize=(10, 6))
_ = plt.legend(loc='best', ncol=2)
We can also do it for a single column
In [50]:
_ = trend.hipster.cumsum().plot()
Or split the columns out to subplots
In [60]:
axs = trend.plot(subplots=True, figsize=(10, 10))
Resampling data is also straightforward.
In [52]:
# resample by month
trend.resample('M', how='mean').head()
Out[52]:
But you can also resample by business day, week, month, quarter, year and a bunch of other frequencies (see the sketch after the next cell)
In [67]:
trend.resample('M', how='mean').hipster.dropna().plot()
Out[67]:
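A minimal sketch of one of the other frequency aliases, in the same how='mean' style as above:
In [ ]:
# sketch: 'B' = business day, 'W' = weekly, 'Q' = quarterly, 'A' = annual
trend.resample('Q', how='mean').head()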
Other fancy plots include a scatter matrix with kernel density estimates (KDE) on the diagonal
In [68]:
# look at the relations
_ = pd.scatter_matrix(trend, figsize=(12,8), diagonal='kde')
In [69]:
df = pd.read_csv('train.csv', header=0)
In [70]:
df.head()
Out[70]:
Let's look at the data types here (this time they're heterogeneous)
In [71]:
df.dtypes
Out[71]:
We can also get a more verbose summary
In [72]:
df.info()
DataFrames can be grouped, like in SQL (it sucked to be a young male on the Titanic)
In [73]:
df_grouped = df.groupby(['Pclass', 'Sex'])
In [74]:
df_grouped[['Age', 'Survived']].mean()
Out[74]:
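Grouping isn't limited to a single statistic; a sketch computing several aggregates per group in one agg call:
In [ ]:
# sketch: count, mean and std of survival for each class/sex group
df_grouped['Survived'].agg(['count', 'mean', 'std'])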
Histograms are straightforward
In [75]:
ax = df['Age'].dropna().hist(bins=20, range=(0,100), alpha = .5)
ax.set_xlabel('Age')
ax.set_ylabel('Passenger Count')
Out[75]:
So are boxplots
In [101]:
bp = df.boxplot(column='Age', by='Pclass', grid=False)
# for i in set(df.Pclass):
# y = df.Age[df.Pclass==i].dropna()
# # Add some random "jitter" to the x-axis
# x = np.random.normal(i, 0.04, size=len(y))
# plt.plot(x, y, 'r.', alpha=0.2)
If we want to do some learning on this data, let's convert gender to a binary numeric column
In [77]:
df['isFemale'] = df['Sex'].map( {'female': 1, 'male': 0} ).astype(int)
df[['Sex','isFemale']].head()
Out[77]:
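A map like this is fine for a two-valued column; for categoricals with more levels, pandas can one-hot encode instead. A hedged sketch (the 'Sex' prefix is just for readable column names):
In [ ]:
# sketch: one indicator column per category value
pd.get_dummies(df['Sex'], prefix='Sex').head()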
Find non-numeric columns so we can drop them later
In [78]:
drop_cols = df.columns[df.dtypes.map(lambda x: x=='object')]
drop_cols
Out[78]:
In [79]:
df.info()
Set up our data to learn from
In [80]:
X = pd.DataFrame(df[[c for c in df.columns if c != 'Survived']])
X = X.drop(drop_cols, axis=1)
X = X.drop('PassengerId', axis=1)
y = df.Survived
print X.head()
Have a quick look at the class distribution
In [81]:
y.groupby(y.values).count()
Out[81]:
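A useful sanity check before modelling: the accuracy of always predicting the majority class, which any real classifier should beat. A quick sketch:
In [ ]:
# sketch: majority-class baseline accuracy
print 'Baseline accuracy: {:.2f}'.format(max(y.mean(), 1 - y.mean()))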
And fill the NaNs in Age with the median
In [82]:
X['Age'] = X.Age.fillna(X.Age.median())
Prediction with scikit-learn is easy - who will survive?
In [83]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score as acc
In [84]:
# create our classifier
clf = LogisticRegression()
# fit it to the data
clf.fit(X, y)
# and predict
preds = clf.predict(X)
res_acc = acc(y, preds)
print 'Accuracy Score: {:.2f}'.format(res_acc)
print 'Not too bad'
In [85]:
from sklearn.cross_validation import KFold
In [86]:
cv = KFold(n=len(y), n_folds=5, shuffle=True)
preds = np.zeros_like(y)
for train, test in cv:
clf = LogisticRegression()
clf.fit(X.ix[train], y.ix[train])
preds[test] = clf.predict(X.ix[test])
res_acc = acc(y, preds)
print 'Accuracy Score: {:.2f}'.format(res_acc)
And cross-validation can be done more easily
In [87]:
# scikit-learn can actually take care of this for us
from sklearn.cross_validation import cross_val_score
clf = LogisticRegression()
scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
print scores
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
In [88]:
df.Embarked.head()
Out[88]:
In [89]:
set(df.Embarked.fillna('O'))
Out[89]:
Use the LabelEncoder
In [90]:
from sklearn import preprocessing
df.Embarked = df.Embarked.fillna('O')
le = preprocessing.LabelEncoder()
le.fit(df.Embarked.values)
le.classes_
Out[90]:
In [91]:
X['Embarked'] = le.transform(df.Embarked.values)
X.Embarked.head()
Out[91]:
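The encoding is reversible; a small sketch mapping the integer codes back to the original labels:
In [ ]:
# sketch: recover the original port labels from the codes
le.inverse_transform(X.Embarked.values[:5])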
In [94]:
for C in [0.001, 0.01, 0.1, 1, 10, 100]:
clf = LogisticRegression(C=C, penalty='l1')
scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
print("n_estimators: {:3.3f}\tAccuracy: {:.2f} (+/- {:.2f})"
.format(C, scores.mean(), scores.std() * 2))
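This manual loop is really a grid search over C; scikit-learn can run it for us with GridSearchCV. A minimal sketch using the same grid and scoring as above:
In [ ]:
# sketch: let GridSearchCV cross-validate every value of C
from sklearn.grid_search import GridSearchCV
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(LogisticRegression(penalty='l1'), param_grid,
                    cv=5, scoring='accuracy')
grid.fit(X, y)
print grid.best_params_, grid.best_score_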
In [95]:
# standardise the data to zero mean, unit variance
from sklearn.preprocessing import StandardScaler
X = StandardScaler().fit_transform(X)
In [96]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.lda import LDA
from sklearn.qda import QDA
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
"Random Forest", "AdaBoost", "Naive Bayes", "LDA",
"QDA", "Logistic Regression"]
classifiers = [
KNeighborsClassifier(3),
SVC(kernel="linear", C=0.025),
SVC(gamma=2, C=1),
DecisionTreeClassifier(),
RandomForestClassifier(),
AdaBoostClassifier(),
GaussianNB(),
LDA(),
QDA(),
LogisticRegression(class_weight='auto')]
In [97]:
# fit each classifier and find the mean performance
res = []
for name, clf in zip(names, classifiers):
scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
res.append(scores.mean())
In [98]:
import prettyplotlib as ppl
res = np.array(res)
names = np.array(names)
idx = np.argsort(res)[::-1]
fig, ax = plt.subplots(1, figsize=(14, 6))
ppl.bar(ax, np.arange(len(res)), res[idx], annotate=True,
xticklabels=names[idx], grid='y')
plt.xticks(rotation=30)
_ = ax.set_ylim(res.min() * 0.95, res.max() * 1.05)
Models can be pickled
In [ ]:
# models can be saved
import pickle
s = pickle.dumps(clf)
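They can be loaded back too. Note that cross_val_score fits clones, so clf itself was never fitted above; a minimal round-trip sketch that fits first:
In [ ]:
# sketch: fit, pickle, restore, and predict with the restored copy
clf.fit(X, y)
clf2 = pickle.loads(pickle.dumps(clf))
clf2.predict(X[:5])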
by Andreas Mueller